# Author: Stephen Situ
# This is a practice project on cleaning data using panda dataframes in python on jupyter
# The original dataset can be found here: https://www.kaggle.com/datasets/gagandeep16/car-sales
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib
# Read Data
car_sales_data = pd.read_csv("Car_sales.csv")
# Shape of Data
car_sales_data.shape
(157, 16)
# Variable Types
car_sales_data.info
<bound method DataFrame.info of Manufacturer Model Sales_in_thousands __year_resale_value \
0 Acura Integra 16.919 16.360
1 Acura TL 39.384 19.875
2 Acura CL 14.114 18.225
3 Acura RL 8.588 29.725
4 Audi A4 20.397 22.255
.. ... ... ... ...
152 Volvo V40 3.545 NaN
153 Volvo S70 15.245 NaN
154 Volvo V70 17.531 NaN
155 Volvo C70 3.493 NaN
156 Volvo S80 18.969 NaN
Vehicle_type Price_in_thousands Engine_size Horsepower Wheelbase \
0 Passenger 21.50 1.8 140.0 101.2
1 Passenger 28.40 3.2 225.0 108.1
2 Passenger NaN 3.2 225.0 106.9
3 Passenger 42.00 3.5 210.0 114.6
4 Passenger 23.99 1.8 150.0 102.6
.. ... ... ... ... ...
152 Passenger 24.40 1.9 160.0 100.5
153 Passenger 27.50 2.4 168.0 104.9
154 Passenger 28.80 2.4 168.0 104.9
155 Passenger 45.50 2.3 236.0 104.9
156 Passenger 36.00 2.9 201.0 109.9
Width Length Curb_weight Fuel_capacity Fuel_efficiency Latest_Launch \
0 67.3 172.4 2.639 13.2 28.0 2/2/2012
1 70.3 192.9 3.517 17.2 25.0 6/3/2011
2 70.6 192.0 3.470 17.2 26.0 1/4/2012
3 71.4 196.6 3.850 18.0 22.0 3/10/2011
4 68.2 178.0 2.998 16.4 27.0 10/8/2011
.. ... ... ... ... ... ...
152 67.6 176.6 3.042 15.8 25.0 9/21/2011
153 69.3 185.9 3.208 17.9 25.0 11/24/2012
154 69.3 186.2 3.259 17.9 25.0 6/25/2011
155 71.5 185.7 3.601 18.5 23.0 4/26/2011
156 72.1 189.8 3.600 21.1 24.0 11/14/2011
Power_perf_factor
0 58.280150
1 91.370778
2 NaN
3 91.389779
4 62.777639
.. ...
152 66.498812
153 70.654495
154 71.155978
155 101.623357
156 85.735655
[157 rows x 16 columns]>
# DataFrame Heads
car_sales_data.head(5)
| Manufacturer | Model | Sales_in_thousands | __year_resale_value | Vehicle_type | Price_in_thousands | Engine_size | Horsepower | Wheelbase | Width | Length | Curb_weight | Fuel_capacity | Fuel_efficiency | Latest_Launch | Power_perf_factor | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Acura | Integra | 16.919 | 16.360 | Passenger | 21.50 | 1.8 | 140.0 | 101.2 | 67.3 | 172.4 | 2.639 | 13.2 | 28.0 | 2/2/2012 | 58.280150 |
| 1 | Acura | TL | 39.384 | 19.875 | Passenger | 28.40 | 3.2 | 225.0 | 108.1 | 70.3 | 192.9 | 3.517 | 17.2 | 25.0 | 6/3/2011 | 91.370778 |
| 2 | Acura | CL | 14.114 | 18.225 | Passenger | NaN | 3.2 | 225.0 | 106.9 | 70.6 | 192.0 | 3.470 | 17.2 | 26.0 | 1/4/2012 | NaN |
| 3 | Acura | RL | 8.588 | 29.725 | Passenger | 42.00 | 3.5 | 210.0 | 114.6 | 71.4 | 196.6 | 3.850 | 18.0 | 22.0 | 3/10/2011 | 91.389779 |
| 4 | Audi | A4 | 20.397 | 22.255 | Passenger | 23.99 | 1.8 | 150.0 | 102.6 | 68.2 | 178.0 | 2.998 | 16.4 | 27.0 | 10/8/2011 | 62.777639 |
# DataFrame Tails
car_sales_data.tail(5)
| Manufacturer | Model | Sales_in_thousands | __year_resale_value | Vehicle_type | Price_in_thousands | Engine_size | Horsepower | Wheelbase | Width | Length | Curb_weight | Fuel_capacity | Fuel_efficiency | Latest_Launch | Power_perf_factor | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 152 | Volvo | V40 | 3.545 | NaN | Passenger | 24.4 | 1.9 | 160.0 | 100.5 | 67.6 | 176.6 | 3.042 | 15.8 | 25.0 | 9/21/2011 | 66.498812 |
| 153 | Volvo | S70 | 15.245 | NaN | Passenger | 27.5 | 2.4 | 168.0 | 104.9 | 69.3 | 185.9 | 3.208 | 17.9 | 25.0 | 11/24/2012 | 70.654495 |
| 154 | Volvo | V70 | 17.531 | NaN | Passenger | 28.8 | 2.4 | 168.0 | 104.9 | 69.3 | 186.2 | 3.259 | 17.9 | 25.0 | 6/25/2011 | 71.155978 |
| 155 | Volvo | C70 | 3.493 | NaN | Passenger | 45.5 | 2.3 | 236.0 | 104.9 | 71.5 | 185.7 | 3.601 | 18.5 | 23.0 | 4/26/2011 | 101.623357 |
| 156 | Volvo | S80 | 18.969 | NaN | Passenger | 36.0 | 2.9 | 201.0 | 109.9 | 72.1 | 189.8 | 3.600 | 21.1 | 24.0 | 11/14/2011 | 85.735655 |
# Summary / Descriptive Statistics
car_sales_data.describe
<bound method NDFrame.describe of Manufacturer Model Sales_in_thousands __year_resale_value \
0 Acura Integra 16.919 16.360
1 Acura TL 39.384 19.875
2 Acura CL 14.114 18.225
3 Acura RL 8.588 29.725
4 Audi A4 20.397 22.255
.. ... ... ... ...
152 Volvo V40 3.545 NaN
153 Volvo S70 15.245 NaN
154 Volvo V70 17.531 NaN
155 Volvo C70 3.493 NaN
156 Volvo S80 18.969 NaN
Vehicle_type Price_in_thousands Engine_size Horsepower Wheelbase \
0 Passenger 21.50 1.8 140.0 101.2
1 Passenger 28.40 3.2 225.0 108.1
2 Passenger NaN 3.2 225.0 106.9
3 Passenger 42.00 3.5 210.0 114.6
4 Passenger 23.99 1.8 150.0 102.6
.. ... ... ... ... ...
152 Passenger 24.40 1.9 160.0 100.5
153 Passenger 27.50 2.4 168.0 104.9
154 Passenger 28.80 2.4 168.0 104.9
155 Passenger 45.50 2.3 236.0 104.9
156 Passenger 36.00 2.9 201.0 109.9
Width Length Curb_weight Fuel_capacity Fuel_efficiency Latest_Launch \
0 67.3 172.4 2.639 13.2 28.0 2/2/2012
1 70.3 192.9 3.517 17.2 25.0 6/3/2011
2 70.6 192.0 3.470 17.2 26.0 1/4/2012
3 71.4 196.6 3.850 18.0 22.0 3/10/2011
4 68.2 178.0 2.998 16.4 27.0 10/8/2011
.. ... ... ... ... ... ...
152 67.6 176.6 3.042 15.8 25.0 9/21/2011
153 69.3 185.9 3.208 17.9 25.0 11/24/2012
154 69.3 186.2 3.259 17.9 25.0 6/25/2011
155 71.5 185.7 3.601 18.5 23.0 4/26/2011
156 72.1 189.8 3.600 21.1 24.0 11/14/2011
Power_perf_factor
0 58.280150
1 91.370778
2 NaN
3 91.389779
4 62.777639
.. ...
152 66.498812
153 70.654495
154 71.155978
155 101.623357
156 85.735655
[157 rows x 16 columns]>
# Check Data Types
car_sales_data.dtypes
Manufacturer object Model object Sales_in_thousands float64 __year_resale_value float64 Vehicle_type object Price_in_thousands float64 Engine_size float64 Horsepower float64 Wheelbase float64 Width float64 Length float64 Curb_weight float64 Fuel_capacity float64 Fuel_efficiency float64 Latest_Launch object Power_perf_factor float64 dtype: object
# Create dataframe of only categorical columns
categorical = car_sales_data.dtypes[car_sales_data.dtypes == "object"].index
print(categorical)
car_sales_data[categorical]
Index(['Manufacturer', 'Model', 'Vehicle_type', 'Latest_Launch'], dtype='object')
| Manufacturer | Model | Vehicle_type | Latest_Launch | |
|---|---|---|---|---|
| 0 | Acura | Integra | Passenger | 2/2/2012 |
| 1 | Acura | TL | Passenger | 6/3/2011 |
| 2 | Acura | CL | Passenger | 1/4/2012 |
| 3 | Acura | RL | Passenger | 3/10/2011 |
| 4 | Audi | A4 | Passenger | 10/8/2011 |
| ... | ... | ... | ... | ... |
| 152 | Volvo | V40 | Passenger | 9/21/2011 |
| 153 | Volvo | S70 | Passenger | 11/24/2012 |
| 154 | Volvo | V70 | Passenger | 6/25/2011 |
| 155 | Volvo | C70 | Passenger | 4/26/2011 |
| 156 | Volvo | S80 | Passenger | 11/14/2011 |
157 rows × 4 columns
# Sorting the first 15 items in Model
sorted(car_sales_data["Model"])[0:15]
['3-Sep', '3000GT', '300M', '323i', '328i', '4Runner', '5-Sep', '528i', 'A4', 'A6', 'A8', 'Accent', 'Accord', 'Alero', 'Altima']
# Describe only Manufacterer column
car_sales_data["Manufacturer"].describe
<bound method NDFrame.describe of 0 Acura
1 Acura
2 Acura
3 Acura
4 Audi
...
152 Volvo
153 Volvo
154 Volvo
155 Volvo
156 Volvo
Name: Manufacturer, Length: 157, dtype: object>
# Check Unique Values in Model
car_sales_data["Model"].unique()
array(['Integra', 'TL', 'CL', 'RL', 'A4', 'A6', 'A8', '323i', '328i',
'528i', 'Century', 'Regal', 'Park Avenue', 'LeSabre', 'DeVille',
'Seville', 'Eldorado', 'Catera', 'Escalade', 'Cavalier', 'Malibu',
'Lumina', 'Monte Carlo', 'Camaro', 'Corvette', 'Prizm', 'Metro',
'Impala', 'Sebring Coupe', 'Sebring Conv.', 'Concorde', 'Cirrus',
'LHS', 'Town & Country', '300M', 'Neon', 'Avenger', 'Stratus',
'Intrepid', 'Viper', 'Ram Pickup', 'Ram Wagon', 'Ram Van',
'Dakota', 'Durango', 'Caravan', 'Escort', 'Mustang', 'Contour',
'Taurus', 'Focus', 'Crown Victoria', 'Explorer', 'Windstar',
'Expedition', 'Ranger', 'F-Series', 'Civic', 'Accord', 'CR-V',
'Passport', 'Odyssey', 'Accent', 'Elantra', 'Sonata', 'I30',
'S-Type', 'Wrangler', 'Cherokee', 'Grand Cherokee', 'ES300',
'GS300', 'GS400', 'LS400', 'LX470', 'RX300', 'Continental',
'Town car', 'Navigator', 'Mirage', 'Eclipse', 'Galant', 'Diamante',
'3000GT', 'Montero', 'Montero Sport', 'Mystique', 'Cougar',
'Sable', 'Grand Marquis', 'Mountaineer', 'Villager', 'C-Class',
'E-Class', 'S-Class', 'SL-Class', 'SLK', 'SLK230', 'CLK Coupe',
'CL500', 'M-Class', 'Sentra', 'Altima', 'Maxima', 'Quest',
'Pathfinder', 'Xterra', 'Frontier', 'Cutlass', 'Intrigue', 'Alero',
'Aurora', 'Bravada', 'Silhouette', 'Breeze', 'Voyager', 'Prowler',
'Sunfire', 'Grand Am', 'Firebird', 'Grand Prix', 'Bonneville',
'Montana', 'Boxter', 'Carrera Coupe', 'Carrera Cabrio', '5-Sep',
'3-Sep', 'SL', 'SC', 'SW', 'LW', 'LS', 'Outback', 'Forester',
'Corolla', 'Camry', 'Avalon', 'Celica', 'Tacoma', 'Sienna', 'RAV4',
'4Runner', 'Land Cruiser', 'Golf', 'Jetta', 'Passat', 'Cabrio',
'GTI', 'Beetle', 'S40', 'V40', 'S70', 'V70', 'C70', 'S80'],
dtype=object)
# Find Nulls
nan_rows = car_sales_data[car_sales_data.isna().any(axis=1)]
print(nan_rows)
nan_rows.index
Manufacturer Model Sales_in_thousands __year_resale_value \
2 Acura CL 14.114 18.225
7 BMW 323i 19.747 NaN
15 Cadillac Seville 15.943 27.100
18 Cadillac Escalade 14.785 NaN
27 Chevrolet Impala 107.995 NaN
33 Chrysler Town & Country 53.480 19.540
34 Chrysler 300M 30.696 NaN
38 Dodge Intrepid 88.028 12.275
44 Dodge Durango 101.323 NaN
50 Ford Focus 175.670 NaN
66 Jaguar S-Type 15.467 NaN
72 Lexus GS400 3.334 NaN
74 Lexus LX470 9.126 NaN
75 Lexus RX300 51.238 NaN
78 Lincoln Navigator 22.925 NaN
96 Mercedes-B SLK 7.998 NaN
97 Mercedes-B SLK230 1.526 NaN
98 Mercedes-B CLK Coupe 11.592 NaN
99 Mercedes-B CL500 0.954 NaN
100 Mercedes-B M-Class 28.976 NaN
106 Nissan Xterra 54.158 NaN
107 Nissan Frontier 65.005 NaN
109 Oldsmobile Intrigue 38.554 NaN
110 Oldsmobile Alero 80.255 NaN
117 Plymouth Prowler 1.872 NaN
123 Pontiac Montana 39.572 NaN
127 Saab 5-Sep 9.191 NaN
128 Saab 3-Sep 12.115 NaN
132 Saturn LW 8.472 NaN
133 Saturn LS 49.989 NaN
134 Subaru Outback 47.107 NaN
135 Subaru Forester 33.028 NaN
141 Toyota Sienna 65.119 NaN
150 Volkswagen Beetle 49.463 NaN
151 Volvo S40 16.957 NaN
152 Volvo V40 3.545 NaN
153 Volvo S70 15.245 NaN
154 Volvo V70 17.531 NaN
155 Volvo C70 3.493 NaN
156 Volvo S80 18.969 NaN
Vehicle_type Price_in_thousands Engine_size Horsepower Wheelbase \
2 Passenger NaN 3.2 225.0 106.9
7 Passenger 26.990 2.5 170.0 107.3
15 Passenger 44.475 4.6 275.0 112.2
18 Car 46.225 5.7 255.0 117.5
27 Passenger 18.890 3.4 180.0 110.5
33 Car NaN NaN NaN NaN
34 Passenger 29.185 3.5 253.0 113.0
38 Passenger 22.505 2.7 202.0 113.0
44 Car 26.310 5.2 230.0 115.7
50 Passenger 12.315 2.0 107.0 103.0
66 Passenger 42.800 3.0 240.0 114.5
72 Passenger 46.305 4.0 300.0 110.2
74 Car 60.105 4.7 230.0 112.2
75 Car 34.605 3.0 220.0 103.0
78 Car 42.660 5.4 300.0 119.0
96 Passenger 38.900 2.3 190.0 94.5
97 Passenger 41.000 2.3 185.0 94.5
98 Passenger 41.600 3.2 215.0 105.9
99 Passenger 85.500 5.0 302.0 113.6
100 Car 35.300 3.2 215.0 111.0
106 Car 22.799 3.3 170.0 104.3
107 Car 17.890 3.3 170.0 116.1
109 Passenger 24.150 3.5 215.0 109.0
110 Passenger 18.270 2.4 150.0 107.0
117 Passenger 43.000 3.5 253.0 113.3
123 Car 25.635 3.4 185.0 120.0
127 Passenger 33.120 2.3 170.0 106.4
128 Passenger 26.100 2.0 185.0 102.6
132 Passenger 18.835 2.2 137.0 106.5
133 Passenger 15.010 2.2 137.0 106.5
134 Passenger 22.695 2.5 165.0 103.5
135 Car 20.095 2.5 165.0 99.4
141 Car 22.368 3.0 194.0 114.2
150 Passenger 15.900 2.0 115.0 98.9
151 Passenger 23.400 1.9 160.0 100.5
152 Passenger 24.400 1.9 160.0 100.5
153 Passenger 27.500 2.4 168.0 104.9
154 Passenger 28.800 2.4 168.0 104.9
155 Passenger 45.500 2.3 236.0 104.9
156 Passenger 36.000 2.9 201.0 109.9
Width Length Curb_weight Fuel_capacity Fuel_efficiency Latest_Launch \
2 70.6 192.0 3.470 17.2 26.0 1/4/2012
7 68.4 176.0 3.179 16.6 26.0 6/28/2011
15 75.0 201.0 NaN 18.5 22.0 4/29/2011
18 77.0 201.2 5.572 30.0 15.0 4/17/2012
27 73.0 200.0 3.389 17.0 27.0 6/18/2011
33 NaN NaN NaN NaN NaN 7/13/2011
34 74.4 197.8 3.567 17.0 23.0 2/10/2012
38 74.7 203.7 3.489 17.0 NaN 6/2/2012
44 71.7 193.5 4.394 25.0 17.0 6/27/2012
50 66.9 174.8 2.564 13.2 30.0 7/22/2012
66 71.6 191.3 3.650 18.4 21.0 11/3/2012
72 70.9 189.2 3.693 19.8 21.0 11/28/2012
74 76.4 192.5 5.401 25.4 15.0 10/30/2012
75 71.5 180.1 3.900 17.2 21.0 1/4/2012
78 79.9 204.8 5.393 30.0 15.0 12/23/2012
96 67.5 157.9 3.055 15.9 26.0 1/16/2011
97 67.5 157.3 2.975 14.0 27.0 8/6/2011
98 67.8 180.3 3.213 16.4 26.0 7/8/2011
99 73.1 196.6 4.115 23.2 20.0 4/11/2011
100 72.2 180.6 4.387 19.0 20.0 2/10/2011
106 70.4 178.0 3.821 19.4 18.0 1/24/2011
107 66.5 196.1 3.217 19.4 18.0 8/27/2011
109 73.6 195.9 3.455 18.0 NaN 4/1/2011
110 70.1 186.7 2.958 15.0 27.0 10/20/2009
117 76.3 165.4 2.850 12.0 21.0 6/27/2012
123 72.7 201.3 3.942 25.0 23.0 7/22/2012
127 70.6 189.2 3.280 18.5 23.0 11/9/2012
128 67.4 182.2 2.990 16.9 23.0 6/12/2011
132 69.0 190.4 3.075 13.1 27.0 8/5/2011
133 69.0 190.4 2.910 13.1 28.0 12/4/2012
134 67.5 185.8 3.415 16.9 25.0 7/7/2011
135 68.3 175.2 3.125 15.9 24.0 9/10/2012
141 73.4 193.5 3.759 20.9 22.0 10/5/2012
150 67.9 161.1 2.769 14.5 26.0 10/20/2011
151 67.6 176.6 2.998 15.8 25.0 2/18/2011
152 67.6 176.6 3.042 15.8 25.0 9/21/2011
153 69.3 185.9 3.208 17.9 25.0 11/24/2012
154 69.3 186.2 3.259 17.9 25.0 6/25/2011
155 71.5 185.7 3.601 18.5 23.0 4/26/2011
156 72.1 189.8 3.600 21.1 24.0 11/14/2011
Power_perf_factor
2 NaN
7 71.191207
15 115.621358
18 109.509117
27 71.838039
33 NaN
34 101.655244
38 80.831470
44 92.854125
50 43.117132
66 102.178985
72 125.013357
74 105.760458
75 91.943802
78 123.972047
96 82.807362
97 81.848969
98 92.925792
99 141.100985
100 90.495532
106 69.782944
107 67.889271
109 86.272523
110 60.727447
117 106.984456
123 76.208440
127 73.503778
128 76.023048
132 56.295243
133 54.819728
134 67.765908
135 66.762943
141 78.027219
150 47.329632
151 66.113057
152 66.498812
153 70.654495
154 71.155978
155 101.623357
156 85.735655
Int64Index([ 2, 7, 15, 18, 27, 33, 34, 38, 44, 50, 66, 72, 74,
75, 78, 96, 97, 98, 99, 100, 106, 107, 109, 110, 117, 123,
127, 128, 132, 133, 134, 135, 141, 150, 151, 152, 153, 154, 155,
156],
dtype='int64')
# Remove any rows with Nulls
car_sales_clean = car_sales_data.drop(labels=nan_rows.index,axis=0,inplace=False)
# Find Duplicates
duplicate_rows = car_sales_clean[car_sales_clean.duplicated(keep='last')]
print(duplicate_rows)
Empty DataFrame Columns: [Manufacturer, Model, Sales_in_thousands, __year_resale_value, Vehicle_type, Price_in_thousands, Engine_size, Horsepower, Wheelbase, Width, Length, Curb_weight, Fuel_capacity, Fuel_efficiency, Latest_Launch, Power_perf_factor] Index: []
# Find Null in a column
car_sales_clean[car_sales_clean[['Horsepower']].isna().any(axis=1)]
| Manufacturer | Model | Sales_in_thousands | __year_resale_value | Vehicle_type | Price_in_thousands | Engine_size | Horsepower | Wheelbase | Width | Length | Curb_weight | Fuel_capacity | Fuel_efficiency | Latest_Launch | Power_perf_factor |
|---|
# Group by Manufacturer / Model and Sales
car_sales_clean.groupby(['Manufacturer','Model'])['Sales_in_thousands'].sum()
Manufacturer Model
Acura Integra 16.919
RL 8.588
TL 39.384
Audi A4 20.397
A6 18.780
...
Volkswagen Cabrio 9.569
GTI 5.596
Golf 9.761
Jetta 83.721
Passat 51.102
Name: Sales_in_thousands, Length: 117, dtype: float64
# Save Cleaned Csv
car_sales_clean.to_csv('car_sales_clean.csv')